################################################################################
# Analyzing the Rhetoric of Supreme Court Confirmation Hearings 
# Jake S. Truscott
# Updated: October 2022
################################################################################

################################################################################
#Import Necessary Packages
################################################################################

import nltk 
import numpy as np
import pandas as pd
import json
import re                                  # library for regular expression operations
import string                              # for string operations
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import string

################################################################################
#Load Training Data
################################################################################

# Read the hand-coded training remarks. The file is decoded as ISO-8859-1.
sample_remarks = pd.read_csv(r"<PATH TO SCOTUS_CH_TRAIN CSV FILE>", encoding="ISO-8859-1")

# List the column names so the expected headers can be checked by eye.
for col in sample_remarks:
    print(col)

# A UTF-8 byte-order mark decoded as ISO-8859-1 appears as "ï»¿" glued to the
# first header; rename that column to the plain name used below.
sample_remarks = sample_remarks.rename(columns = {"ï»¿statement_original" : "statement"})

# Labels of every row, captured before any filtering.
remarks_output = sample_remarks['bayes_code']

# Keep only rows coded 1 (positive) or -1 (negative); rows with any other code
# are dropped. Positives are stacked ahead of negatives.
all_positive = sample_remarks[sample_remarks['bayes_code'] == 1]
all_negative = sample_remarks[sample_remarks['bayes_code'] == -1]

frames = [all_positive, all_negative]

sample_remarks = pd.concat(frames)

# Recode the negative class from -1 to 0 so the labels are binary (0/1).
sample_remarks['bayes_code'] = sample_remarks['bayes_code'].replace([-1], 0)

remarks = sample_remarks['statement'].to_list()
outputs = sample_remarks['bayes_code'].to_list()

# See Number of Positive & Negative Remarks
# (the counts are passed to print() itself; previously they sat outside the
# call and were never displayed)
print('Number of Total Remarks ', len(remarks), sep='')
print('Number of Total Outputs ', len(outputs), sep='')


################################################################################
#Data Cleaning
################################################################################


def remove_hyperlinks_marks_styles(tweet):
    """Strip retweet markers, hyperlinks and hash signs from a piece of text.

    Three clean-ups are applied in order:
      1. a leading "RT" followed by whitespace is removed;
      2. everything from "http://" or "https://" to the end of that line,
         plus any newline characters that follow, is removed;
      3. every "#" character is deleted (the tagged word itself is kept).

    Returns the cleaned string.
    """
    regex_removals = (
        r'^RT[\s]+',               # old style retweet text "RT"
        r'https?:\/\/.*[\r\n]*',   # hyperlinks
    )

    cleaned = tweet
    for pattern in regex_removals:
        cleaned = re.sub(pattern, '', cleaned)

    # only the hash sign is dropped, not the word it is attached to
    return cleaned.replace('#', '')

#Tokenize String

# One shared NLTK TweetTokenizer instance, configured with preserve_case=False,
# strip_handles=True and reduce_len=True.
tokenizer = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False,
)


def tokenize_tweet(tweet):
    """Return the tokens produced by the module-level ``tokenizer`` for *tweet*."""
    return tokenizer.tokenize(tweet)

#Remove Stop Words and Punctuations

# Fetch the NLTK stop-word corpus so the English list below can be loaded.
nltk.download('stopwords')

# ASCII punctuation characters (a single string) and the English stop words
# list from NLTK; both are consulted by remove_stopwords_punctuations().
punctuations = string.punctuation
stopwords_english = stopwords.words('english')


def remove_stopwords_punctuations(tweet_tokens):
    """Filter stop words and punctuation out of a token list.

    A token is dropped when it appears in ``stopwords_english`` or when it is
    found inside the ``punctuations`` string. Because ``punctuations`` is a
    string, that second check is a substring test. Token order is preserved.

    Returns a new list with the surviving tokens.
    """
    return [
        token
        for token in tweet_tokens
        if not (token in stopwords_english or token in punctuations)
    ]

#Stemming

# Shared Porter stemmer used by get_stem().
stemmer = PorterStemmer()


def get_stem(tweets_clean):
    """Return a new list holding ``stemmer.stem(token)`` for each input token."""
    return [stemmer.stem(token) for token in tweets_clean]

#Combine all the pre-processing

def process_tweet(tweet):
    """Run the full cleaning pipeline on one piece of text.

    Steps: strip retweet markers / hyperlinks / hash signs, tokenize, then
    remove stop words and punctuation.

    Returns the list of cleaned tokens.
    """
    tokens = tokenize_tweet(remove_hyperlinks_marks_styles(tweet))

    # Stemming (get_stem) is currently disabled; the filtered tokens are
    # returned unstemmed.
    return remove_stopwords_punctuations(tokens)

################################################################################
#Split Datasets into Training and Testing
################################################################################

from sklearn.model_selection import train_test_split

# Hold out half of the labelled remarks for evaluation. No random_state is
# passed, so each run produces a different partition.
train_x, test_x, train_y, test_y = train_test_split(remarks, outputs, test_size = 0.5)

# Report the partition sizes. As bare `len(...)` expressions these had no
# visible effect when the file is run as a script.
print('train_x size:', len(train_x))
print('test_x size:', len(test_x))
print('test_y size:', len(test_y))
print('train_y size:', len(train_y))

################################################################################
#Create Frequency Dictionary
################################################################################

def create_frequency(tweets, ys):
    """Count how often each cleaned token occurs under each label.

    Input:
        tweets: iterable of raw text items
        ys: iterable of labels, paired with ``tweets`` position by position
    Output:
        dictionary mapping (token, label) to the number of occurrences of that
        token across all texts carrying that label
    """
    counts = {}
    for text, label in zip(tweets, ys):
        for token in process_tweet(text):
            key = (token, label)
            counts[key] = counts.get(key, 0) + 1
    return counts


################################################################################
#Train Model Using Naive Bayes
################################################################################

# Convert the split lists to NumPy arrays before building the frequency table.
train_x, train_y = np.array(train_x), np.array(train_y)
test_x, test_y = np.array(test_x), np.array(test_y)

# (token, label) -> count, computed from the training half only.
freqs = create_frequency(train_x, train_y)

def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit a two-class Naive Bayes model with add-one (Laplace) smoothing.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: the training texts (not used by the computation; kept so the
            signature stays the same for existing callers)
        train_y: labels corresponding to the texts (0 or 1); may be a list or
            a NumPy array
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
            of positive / negative training documents. If either class is
            absent this is infinite (NumPy emits a divide-by-zero warning).
        loglikelihood: dictionary from word to
            log(P(word | positive) / P(word | negative))
    '''
    # Accept plain lists as well as arrays (the labels are read via .shape).
    labels = np.asarray(train_y)

    loglikelihood = {}

    # number of unique words in the vocabulary
    unique_words = {word for word, _ in freqs}
    V = len(unique_words)

    # N_pos / N_neg: total word occurrences observed under each label
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # D: number of documents; D_pos / D_neg: documents per class
    D = labels.shape[0]
    D_pos = labels.sum()
    D_neg = D - D_pos

    logprior = np.log(D_pos) - np.log(D_neg)

    for word in unique_words:
        # positive and negative frequency of the word (0 when never seen)
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # smoothed probability of the word under each class
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood
  
# Fit the classifier on the training half, then show the log prior and the
# number of words that received a log-likelihood score (one per line).
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior, len(loglikelihood), sep="\n")

def naive_bayes_predict(remark, logprior, loglikelihood):
    '''
    Score one piece of text with a fitted Naive Bayes model.

    Input:
        remark: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the sum of the log likelihoods of every cleaned token
           of the remark that is present in the dictionary (tokens that are
           not in the dictionary contribute nothing)
    '''

    # Clean and tokenize the text that was actually passed in (the argument,
    # not a module-level variable).
    word_l = process_tweet(remark)

    # start from the log prior
    p = 0 + logprior

    # add the log likelihood of every known token
    for word in word_l:
        p += loglikelihood.get(word, 0)

    return p

def flatten_list(_2d_list):
    """Flatten one level of nesting.

    Elements whose type is exactly ``list`` are unpacked into the result;
    every other element (including tuples and deeper nested lists found inside
    a sub-list) is copied over unchanged.

    Returns a new flat list.
    """
    flattened = []
    for entry in _2d_list:
        if type(entry) is not list:
            flattened.append(entry)
            continue
        flattened.extend(entry)
    return flattened

################################################################################
#Test Sample
# Run this cell to test your function
################################################################################

from sklearn.metrics import classification_report

# Score every held-out remark. Scores are collected in a list and turned into
# a frame once (DataFrame.append no longer exists in pandas 2.x).
predicted_scores = []
for tweet in test_x:
    y_hat = naive_bayes_predict(tweet, logprior, loglikelihood)
    predicted_scores.append(float(y_hat))

# Predicted class per remark: 1 when the score is above zero, otherwise 0.
robustness = pd.DataFrame({
    'naive_statement': test_x,
    'naive_score': [1 if score > 0 else 0 for score in predicted_scores],
})

# Hand-coded labels for the same remarks, in the same order.
test_scores = pd.DataFrame({'naive_statement': test_x, 'test_strings': test_y})

# Pair predictions with labels by position. Joining on the statement text
# would multiply rows whenever the same remark occurs more than once.
accuracy = robustness.assign(test_strings=test_scores['test_strings'])

target = ["Negative", "Positive"]

naive_score = accuracy['naive_score']
test_score = accuracy['test_strings']

# classification_report expects (y_true, y_pred): hand-coded labels first,
# model predictions second.
print(classification_report(test_score, naive_score, target_names = target))

################################################################################
#Testing Against Real Data
#Replace path with necessary location
################################################################################

# Load the full set of confirmation-hearing remarks to be classified.
# (A stray closing quote on this call previously made the file unparseable.)
scotus_ch = pd.read_csv(r"<PATH TO TRUSCOTT_JLC_SCOTUS_CH CSV FILE>")

# Show summary statistics and the column names for a quick sanity check.
print(scotus_ch.describe())
for col in scotus_ch:
    print(col)

# Raw remark text, one entry per row of scotus_ch.
scotus_statement = list(scotus_ch['statement_original'])

################################################################################
#Clean Testing Remarks
################################################################################

# Clean every remark and store it as one space-separated string of tokens.
scotus_remarks = []
for remark in scotus_statement:
    # Unpack any list-typed entries before joining the tokens.
    tokens = flatten_list(process_tweet(remark))
    scotus_remarks.append(' '.join(map(str, tokens)))
  

################################################################################
# Run Naive Bayes Classifier -- Export each output to a dataframe 
# Merge Classifications with Original Dataframe
# Export Results to local machine
################################################################################

# Score every cleaned remark. Rows are gathered in a list and converted to a
# frame once: DataFrame.append no longer exists in pandas 2.x, and appending
# row by row copied the whole frame on every iteration.
scored_rows = []
for tweet in scotus_remarks:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    scored_rows.append({'naive_statement': tweet, 'naive_score': float(p)})

# Explicit columns keep the frame well-formed even when there are no remarks.
nb_scores = pd.DataFrame(scored_rows, columns=['naive_statement', 'naive_score'])

#Merge Scores Back with original dataframe (rows are matched by index)
scotus_ch_naive = pd.concat([scotus_ch, nb_scores], axis=1)

#Export as CSV
scotus_ch_naive.to_csv(r"<EXPORT FILE LOCATION TO LOCAL MACHINE>", index = False)




